home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
X11
/
wais
/
ir
/
irhash.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-05-09
|
12KB
|
374 lines
/* WIDE AREA INFORMATION SERVER SOFTWARE:
No guarantees or restrictions. See the readme file for the full standard
disclaimer.
Brewster@think.com
*/
/* The memory hashtables for building an index. */
/* -brewster 5/90 */
/* Change log:
* $Log: irhash.c,v $
* Revision 1.16 92/03/20 11:04:33 jonathan
* Added code to handle switches for word_pairs and word_postition info.
*
* Revision 1.15 92/02/12 13:26:28 jonathan
* Added "$Log" so RCS will put the log message in the header
*
*/
/* main functions:
* add_word
* finished_add_word
* look_up_word
*
* The idea is to store up a bunch of words before going to disk.
* A word entry points to where it will go on disk, and
* accumulates the entries before doing it.
*
* Some of the policy issues in this file are:
* How much weight should the first occurance of a word in a document get
* over the other occurances. The first occurance should be worth more
* so that words with 3 occurances of "dog" and not "cat"'s should not
* win out over 1 "dog" and 1 "cat" if the question is "Tell me about cats
* torture dogs"
* The extra weight is 5 at this point.
*
*/
/* To Do:
* done: Improve the hashing functions.
* done: stop inserting into hash table after max number have been accumulated
* done: make flush not flush buffers that are too big.
*/
#include <ctype.h>
#include <string.h> /* for strlen(), memset() */
#include "panic.h"
#include "cutil.h"
#include "irfiles.h"
#include "irhash.h"
#include "stoplist.h"
#include "irinv.h"
#ifdef UNIX
#define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */
#else
#define PRINT_AS_INDEXING false
#endif
/* ================================
=== Word Occurance Buffers ===
================================ */
/* Word occurance buffers
* This is a simple memory allocator for use with the word memory hashtable.
* Since the buffers are tiny, this is done as a copy-sweep GC scheme.
* Oh, I long for the storage system of lisp.
*/
unsigned char *first_word_occurance_buffer = NULL; /* allocate blocks out of this */
unsigned char *last_word_occurance_buffer = NULL;
long word_occurance_block_length = 256000; /* maybe this should be larger? */
unsigned char * word_occurance_free_ptr = NULL;
unsigned char *make_word_occurrance_block(size)
long size;
{
/* allocates a word_occurance_block out of the buffers */
/* old way: s_malloc((size_t)size); */
/* returns a pointer to a piece of memory */
if(NULL == first_word_occurance_buffer){
/* initialize it */
first_word_occurance_buffer =
(unsigned char *)s_malloc(MAX(word_occurance_block_length,
sizeof(size_t)+ size));
*(unsigned char **)first_word_occurance_buffer = NULL; /* set the end */
last_word_occurance_buffer = first_word_occurance_buffer;
word_occurance_free_ptr = first_word_occurance_buffer + sizeof(size_t);
}
if((long)word_occurance_free_ptr + size >=
word_occurance_block_length + (long)last_word_occurance_buffer){
/* then allocate a new block */
unsigned char * new_block = (unsigned char *)s_malloc(MAX(word_occurance_block_length,
sizeof(size_t)+ size));
*(unsigned char **)new_block = NULL; /* set the end of the chain */
*(unsigned char **)last_word_occurance_buffer = new_block;
word_occurance_free_ptr = new_block + sizeof(size_t);
last_word_occurance_buffer = new_block;
}
/* allocate away */
{ unsigned char * answer = word_occurance_free_ptr;
word_occurance_free_ptr += size;
return(answer);
}
}
void free_word_occurance_block(block)
unsigned char *block;
{
/* this is not used with the new scheme, but is here in case
malloc is a win on some systems */
/* old way s_free(block); */
}
static void flush_word_occur_bufs_internal
_AP((unsigned char* head_of_list));
static void flush_word_occur_bufs_internal(head_of_list)
unsigned char* head_of_list;
/* frees all word occurance buffers. This should be done with care */
{
while(1){
unsigned char * next_block;
if(NULL == head_of_list)
break;
next_block = *(unsigned char **)head_of_list;
s_free(head_of_list);
head_of_list = next_block;
}
}
void flush_word_occurance_buffers()
{
/* frees all word occurance buffers. This should be done with care */
flush_word_occur_bufs_internal(first_word_occurance_buffer);
first_word_occurance_buffer = NULL;
word_occurance_free_ptr = NULL;
last_word_occurance_buffer = NULL;
}
/* ---------------------------------------------------- */
static hash_entry* look_up_word _AP((char* word,hashtable*
the_word_memory_hashtable));
static hash_entry*
look_up_word(word,the_word_memory_hashtable)
char* word;
hashtable* the_word_memory_hashtable;
{
hash_entry * answer = get_hash(word, the_word_memory_hashtable);
if(NULL != answer)
return(answer);
else{
hash_entry wrd_entry;
answer = put_hash(word, the_word_memory_hashtable, &wrd_entry);
answer->number_of_occurances = 0;
answer->memory_ptr =
make_word_occurrance_block(WORD_MEMORY_INIT_BLOCK_SIZE);
answer->current_memory_ptr = answer->memory_ptr;
answer->memory_size = WORD_MEMORY_INIT_BLOCK_SIZE;
answer->current_doc_id = 0;
return(answer);
}
}
static unsigned char add_weight _AP((long current_weight,long new_weight));
static unsigned char
add_weight(current_weight,new_weight)
long current_weight;
long new_weight;
/* add a new weight to the existing one */
{
/* this should be smarter than this, like doing the log or something */
if(127 < (current_weight + new_weight)){
/* the max char. should be 255, but does not work on all compilers */
return(127);
}
else{
return(current_weight + new_weight);
}
}
static unsigned char* more_memory _AP((unsigned char* current_memory_ptr,
long current_memory_size,
long new_size));
static unsigned char* more_memory(current_memory_ptr,current_memory_size,new_size)
unsigned char* current_memory_ptr;
long current_memory_size;
long new_size;
/* Allocates more memory for a hash_entry. It transfers all the bytes
* from the old to the new and then returns the new.
*/
{
unsigned char* new_memory = NULL;
if(current_memory_size > new_size){
panic("trying to contract a hash_entry block. This is not right\n");
}
new_memory = make_word_occurrance_block(new_size);
if(NULL == new_memory){
panic("Out of memory.");
}
memset(new_memory, 0, new_size);
memmove(new_memory, current_memory_ptr, (size_t)current_memory_size);
return(new_memory);
}
static long more_memory_size _AP((long current_size,
long number_of_occurances));
static long more_memory_size(current_size,number_of_occurances)
long current_size;
long number_of_occurances;
/* This is pretty important to get right. This is a place holder */
{
return(MAX(2 * current_size, WORD_MEMORY_INIT_BLOCK_SIZE));
}
long write_bytes_to_memory(value,size,ptr)
long value;
long size;
unsigned char* ptr;
{
/* writes the number into memory lsb first.
returns the number of bytes written */
long i;
long original_value = value;
if(size < 0) /* paranoia */
panic("attempting to write a negative number of bytes");
ptr += size; /* start at the end of the block and write backwards */
for (i = 0; i < size; i++){
ptr--;
*ptr = (unsigned char)(value & 0xFF);
value = value >> 8;
}
if(value != 0)
panic("In a call to write_bytes_to_memory, the value %ld can not be represented in %ld bytes", original_value, size);
return(size);
}
/* adds a word to the hashtable. Returns the 0 if successful.
See irext.h for more documentation.
*/
long add_word(word, char_pos, line_pos,
weight, doc_id, date, word_pair, db, word_position)
char *word; /* the word to be indexed, this could be a
word pair. If NULL there are no more words
to be indexed */
long char_pos; /* the position of the start of the
word */
long line_pos; /* this is passed for the best
section calculation */
long weight; /* how important the word looks
syntactically (such as is it bold)
NOT used by signature system */
long doc_id; /* current document, this will never be 0 */
time_t date; /* display day of this document, 0 if not known */
long word_pair;
database* db; /* database to insert the document */
boolean word_position; /* whether to have multiple entries for words in a document */
{
/* look up the word in the hashtable */
/* creates it if necessary */
hash_entry* wrd_entry;
hashtable * the_word_memory_hashtable = db->the_word_memory_hashtable;
/* printf("Word: '%s' doc_id: %ld, pos: %ld, weight: %ld\n",
word, doc_id, char_pos, weight); */
if(NULL == db->the_word_memory_hashtable){
panic("The memory word hashtable is not defined.");
}
/* if we have indexed enough words flush the memory copies to disk. */
if(db->number_of_words_in_hashtable == db->flush_after_n_words)
flush_memory_hashtable_to_disk(db, false);
wrd_entry = look_up_word(word, the_word_memory_hashtable);
wrd_entry->number_of_occurances ++;
if(wrd_entry->number_of_occurances > MAX_OCCURANCES){
/* do nothing. we have enough of that word */
}
else{
/* we have a word to add */
db->number_of_words_in_hashtable ++;
if(word_position || doc_id != wrd_entry->current_doc_id){
/* then we have a new doc_id to add to the memory block */
/* check to see if we need more memory */
if((wrd_entry->memory_size -
(wrd_entry->current_memory_ptr -
wrd_entry->memory_ptr)
<
INDEX_ELEMENT_SIZE)){
/* we need more memory. this makes more and frees the old*/
unsigned char* old_memory_ptr = wrd_entry->memory_ptr;
long new_size =
more_memory_size(wrd_entry->memory_size,
wrd_entry->number_of_occurances);
/* cprintf(PRINT_AS_INDEXING, "Get more memory %ld bytes for %s\n", new_size, word); */
wrd_entry->memory_ptr =
more_memory(wrd_entry->memory_ptr, wrd_entry->memory_size,
new_size);
wrd_entry->current_memory_ptr =
wrd_entry->memory_ptr + /* new offset */
(wrd_entry->current_memory_ptr - old_memory_ptr);
/* just being paranoid... no longer illegal
if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr)
panic("After allocating more memory, the size went to 0");
*/
wrd_entry->memory_size = new_size;
} /* finished making more memory */
/* add away */
wrd_entry->current_memory_ptr +=
write_bytes_to_memory(doc_id, DOCUMENT_ID_SIZE,
wrd_entry->current_memory_ptr);
wrd_entry->current_memory_ptr +=
write_bytes_to_memory(char_pos,
CHARACTER_POSITION_SIZE,
wrd_entry->current_memory_ptr);
wrd_entry->current_memory_ptr +=
write_bytes_to_memory(weight +
/* add 5 since for the first one */
(doc_id != wrd_entry->current_doc_id)?5:0,
WEIGHT_SIZE,
wrd_entry->current_memory_ptr);
wrd_entry->current_doc_id = doc_id;
}
else{
/* The word is already there,
* just increment the weight in the record.
* This will change when/if position information is kept (for proximity).
*/
if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr){
panic("Memory hashtable error. Recorded doc_id %ld, current doc_id %ld\n",
wrd_entry->current_doc_id, doc_id);
}
*(wrd_entry->current_memory_ptr - 1) =
add_weight(*(wrd_entry->current_memory_ptr - 1), weight);
}
}
return(0L);
}
void add_stop_words(the_word_memory_hashtable)
hashtable *the_word_memory_hashtable;
/* add the stop words to the hashtable. this must be done before
adding other words */
{
init_stop_list();
while(true){
char *word = next_stop_word();
hash_entry* wrd_entry;
if(NULL == word)
break;
wrd_entry = look_up_word(word, the_word_memory_hashtable);
wrd_entry->number_of_occurances = STOP_WORD_FLAG;
}
}